! ---
! Copyright (C) 1996-2016	The SIESTA group
!  This file is distributed under the terms of the
!  GNU General Public License: see COPYING in the top directory
!  or http://www.gnu.org/copyleft/gpl.txt .
! See Docs/Contributors.txt for a list of contributors.
! ---

!*********************************************************************
!        MODULE fft1d
!        Just a wrapper for routines nfft, gpfa, and setgpfa
!
!*********************************************************************
!
!        subroutine nfft( N )
!
!        CHANGES N INTO THE NEXT INTEGER ALLOWED BY THE GPFA ROUTINE
!        WRITTEN BY J.M.SOLER. MAY/95.
!
!        Input and output:
!        integer:: N    ! Candidate FFT vector size, increased to
!                         next allowed value, if necessary
!
!*********************************************************************
!
!        SUBROUTINE 'GPFA'
!        SELF-SORTING IN-PLACE GENERALIZED PRIME FACTOR (COMPLEX) FFT
!
!        *** THIS IS THE ALL-FORTRAN VERSION ***
!            -------------------------------
!
!        CALL GPFA(A,B,TRIGS,INC,JUMP,N,LOT,ISIGN)
!
!        A IS FIRST REAL INPUT/OUTPUT VECTOR
!        B IS FIRST IMAGINARY INPUT/OUTPUT VECTOR
!        TRIGS IS A TABLE OF TWIDDLE FACTORS, PRECALCULATED
!              BY CALLING SUBROUTINE 'SETGPFA'
!        INC IS THE INCREMENT WITHIN EACH DATA VECTOR
!        JUMP IS THE INCREMENT BETWEEN DATA VECTORS
!        N IS THE LENGTH OF THE TRANSFORMS:
!          -----------------------------------
!            N = (2**IP) * (3**IQ) * (5**IR)
!          -----------------------------------
!        LOT IS THE NUMBER OF TRANSFORMS
!        ISIGN = +1 FOR FORWARD TRANSFORM
!              = -1 FOR INVERSE TRANSFORM
!
!        WRITTEN BY CLIVE TEMPERTON
!        RECHERCHE EN PREVISION NUMERIQUE
!        ATMOSPHERIC ENVIRONMENT SERVICE, CANADA
!
!----------------------------------------------------------------------
!
!        DEFINITION OF TRANSFORM
!        -----------------------
!
!        X(J) = SUM(K=0,...,N-1)(C(K)*EXP(ISIGN*2*I*J*K*PI/N))
!
!---------------------------------------------------------------------
!
!        FOR A MATHEMATICAL DEVELOPMENT OF THE ALGORITHM USED,
!        SEE:
!
!        C TEMPERTON : "A GENERALIZED PRIME FACTOR FFT ALGORITHM
!          FOR ANY N = (2**P)(3**Q)(5**R)",
!          SIAM J. SCI. STAT. COMP., MAY 1992.
!
!*********************************************************************
!
!        SUBROUTINE 'SETGPFA'
!        SETUP ROUTINE FOR SELF-SORTING IN-PLACE
!            GENERALIZED PRIME FACTOR (COMPLEX) FFT [GPFA]
!
!        CALL SETGPFA(TRIGS,N)
!
!        INPUT :
!        -----
!        N IS THE LENGTH OF THE TRANSFORMS. N MUST BE OF THE FORM:
!          -----------------------------------
!            N = (2**IP) * (3**IQ) * (5**IR)
!          -----------------------------------
!
!        OUTPUT:
!        ------
!        TRIGS IS A TABLE OF TWIDDLE FACTORS,
!          OF LENGTH 2*IPQR (REAL) WORDS, WHERE:
!          --------------------------------------
!            IPQR = (2**IP) + (3**IQ) + (5**IR)
!          --------------------------------------
!
!        WRITTEN BY CLIVE TEMPERTON 1990
!
!*********************************************************************

      MODULE fft1d

! Used module procedures:
      USE sys,       only: die     ! Termination routine

! Used module parameters, variables, and arrays:
      USE precision, only: dp      ! Double precision real kind
      USE precision, only: grid_p  ! Real kind used in grid arrays
      USE parallel,  only: ionode  ! Is this the IO processor?

      implicit none

! Public module procedures:
      PUBLIC::
     .  nfft,    ! Finds the next integer allowed by the FFT routine
     .  gpfa,    ! 1D FFT
     .  setgpfa  ! Initializes gpfa routine

! Public module parameters, variables, and arrays:
!     none

      PRIVATE ! Nothing is declared public below this point

      CONTAINS

!*********************************************************************

      SUBROUTINE nfft( N )

C CHANGES N INTO THE NEXT INTEGER ALLOWED BY THE FFT ROUTINE
C WRITTEN BY J.M.SOLER. MAY/95.

      integer :: np, nmax, nmin, n, nrem, ip
      parameter (NP = 3, NMAX = 1000000)
      integer :: IPRIME(NP)
      data IPRIME / 2, 3, 5 /

      NMIN = N
      DO N = NMIN, NMAX
        NREM = N
        DO IP = 1,NP
   10     CONTINUE 
          IF ( MODULO( NREM, IPRIME(IP) ) .EQ. 0 ) THEN
            NREM = NREM / IPRIME(IP)
            GOTO 10
          ENDIF
        ENDDO
        IF (NREM .EQ. 1) RETURN
      ENDDO
      if (Ionode)
     $    write(6,*) 'NFFT: NO SUITABLE INTEGER FOUND FOR N =', NMIN
      call die("see above")
      END SUBROUTINE nfft


!*********************************************************************

      SUBROUTINE GPFA(A,B,TRIGS,INC,JUMP,N,LOT,ISIGN)

      IMPLICIT NONE
*
      REAL(grid_p) ::     A(*), B(*) 
      REAL(dp)  TRIGS(*)
      INTEGER INC, JUMP, N, LOT, ISIGN
C
      INTEGER 
     .  NJ(3), NN, IFAC, LL, KK, IP, IQ, IR, I
*
*     DECOMPOSE N INTO FACTORS 2,3,5
*     ------------------------------
      NN = N
      IFAC = 2
*
      DO 30 LL = 1 , 3
      KK = 0
   10 CONTINUE
      IF (MOD(NN,IFAC).NE.0) GO TO 20
      KK = KK + 1
      NN = NN / IFAC
      GO TO 10
   20 CONTINUE
      NJ(LL) = KK
      IFAC = IFAC + LL
   30 CONTINUE
*
      IF (NN.NE.1) THEN
         WRITE(6,40) N
   40    FORMAT(' *** WARNING!!!',I10,' IS NOT A LEGAL VALUE OF N ***')
         RETURN
      ENDIF
*
      IP = NJ(1)
      IQ = NJ(2)
      IR = NJ(3)
*
*     COMPUTE THE TRANSFORM
*     ---------------------
      I = 1
      IF (IP.GT.0) THEN
         CALL GPFA2F(A,B,TRIGS,INC,JUMP,N,IP,LOT,ISIGN)
         I = I + 2 * ( 2**IP)
      ENDIF
      IF (IQ.GT.0) THEN
         CALL GPFA3F(A,B,TRIGS(I),INC,JUMP,N,IQ,LOT,ISIGN)
         I = I + 2 * (3**IQ)
      ENDIF
      IF (IR.GT.0) THEN
         CALL GPFA5F(A,B,TRIGS(I),INC,JUMP,N,IR,LOT,ISIGN)
      ENDIF
*
      RETURN
      END SUBROUTINE GPFA


!*********************************************************************

*     fortran version of *gpfa2* -
*     radix-2 section of self-sorting, in-place, generalized pfa
*     central radix-2 and radix-8 passes included
*      so that transform length can be any power of 2
*

      subroutine gpfa2f(a,b,trigs,inc,jump,n,mm,lot,isign)

      real(grid_p) ::     a(*), b(*)
      real(dp)     ::     trigs(*)
      integer inc, jump, n, mm, lot, isign

      integer  n2, inq, jstepx, ninc, ink, m2, m8,
     $         m, mh, nblox, left, istart, nb, nvex,
     $         la, mu, ipass, jstep, jstepl, jjj, ja,
     $         nu, jb, jc, jd, j, l, kk, k, je, jf,
     $         jg, jh, laincl, ll, ji, jj, jk, jl, jm,
     $         jn, jo, jp

      real(dp) :: s, aja, ajc, t0, t2, ajb, ajd,
     $                t1, t3, bja, bjc, u0, u2, bjb,
     $                bjd, u1, u3, co1, si1, co2, si2,
     $                co3, si3, ss, c1, c2, c3, aje,
     $                ajf, ajg, ajh, bje, bjf, bjg,
     $                bjh, co4, si4, co5, si5, co6, si6,
     $                co7, si7, aji, bjm, ajj, bjj, ajk,
     $                ajl, bji, bjk, ajo, bjl, bjo, ajm,
     $                ajn, ajp, bjn, bjp

#ifdef OLD_CRAY
      integer, parameter :: lvr = CRAY_LVR_PARAMETER
#else
      integer, parameter :: lvr = 1024
#endif
*
*     ***************************************************************
*     *                                                             *
*     *  N.B. LVR = LENGTH OF VECTOR REGISTERS, SET TO 128 FOR C90. *
*     *  RESET TO 64 FOR OTHER CRAY MACHINES, OR TO ANY LARGE VALUE *
*     *  (GREATER THAN OR EQUAL TO LOT) FOR A SCALAR COMPUTER.      *
*     *                                                             *
*     ***************************************************************
*
      n2 = 2**mm
      inq = n/n2
      jstepx = (n2-n) * inc
      ninc = n * inc
      ink = inc * inq
*
      m2 = 0
      m8 = 0
      if (mod(mm,2).eq.0) then
         m = mm/2
      else if (mod(mm,4).eq.1) then
         m = (mm-1)/2
         m2 = 1
      else if (mod(mm,4).eq.3) then
         m = (mm-3)/2
         m8 = 1
      endif
      mh = (m+1)/2
*
      nblox = 1 + (lot-1)/lvr
      left = lot
      s = float(isign)
      istart = 1
*
*  loop on blocks of lvr transforms
*  --------------------------------
      do 500 nb = 1 , nblox
*
      if (left.le.lvr) then
         nvex = left
      else if (left.lt.(2*lvr)) then
         nvex = left/2
         nvex = nvex + mod(nvex,2)
      else
         nvex = lvr
      endif
      left = left - nvex
*
      la = 1
*
*  loop on type I radix-4 passes
*  -----------------------------
      mu = mod(inq,4)
      if (isign.eq.-1) mu = 4 - mu
      ss = 1.0
      if (mu.eq.3) ss = -1.0
*
      if (mh.eq.0) go to 200
*
      do 160 ipass = 1 , mh
      jstep = (n*inc) / (4*la)
      jstepl = jstep - ninc
*
*  k = 0 loop (no twiddle factors)
*  -------------------------------
      do 120 jjj = 0 , (n-1)*inc , 4*jstep
      ja = istart + jjj
*
*     "transverse" loop
*     -----------------
      do 115 nu = 1 , inq
      jb = ja + jstepl
      if (jb.lt.istart) jb = jb + ninc
      jc = jb + jstepl
      if (jc.lt.istart) jc = jc + ninc
      jd = jc + jstepl
      if (jd.lt.istart) jd = jd + ninc
      j = 0
*
*  loop across transforms
*  ----------------------
#ifdef OLD_CRAY
cdir$ ivdep, shortloop
#endif
      do 110 l = 1 , nvex
      aja = a(ja+j)
      ajc = a(jc+j)
      t0 = aja + ajc
      t2 = aja - ajc
      ajb = a(jb+j)
      ajd = a(jd+j)
      t1 = ajb + ajd
      t3 = ss * ( ajb - ajd )
      bja = b(ja+j)
      bjc = b(jc+j)
      u0 = bja + bjc
      u2 = bja - bjc
      bjb = b(jb+j)
      bjd = b(jd+j)
      u1 = bjb + bjd
      u3 = ss * ( bjb - bjd )
      a(ja+j) = t0 + t1
      a(jc+j) = t0 - t1
      b(ja+j) = u0 + u1
      b(jc+j) = u0 - u1
      a(jb+j) = t2 - u3
      a(jd+j) = t2 + u3
      b(jb+j) = u2 + t3
      b(jd+j) = u2 - t3
      j = j + jump
  110 continue
      ja = ja + jstepx
      if (ja.lt.istart) ja = ja + ninc
  115 continue
  120 continue
*
*  finished if n2 = 4
*  ------------------
      if (n2.eq.4) go to 490
      kk = 2 * la
*
*  loop on nonzero k
*  -----------------
      do 150 k = ink , jstep-ink , ink
      co1 = trigs(kk+1)
      si1 = s*trigs(kk+2)
      co2 = trigs(2*kk+1)
      si2 = s*trigs(2*kk+2)
      co3 = trigs(3*kk+1)
      si3 = s*trigs(3*kk+2)
*
*  loop along transform
*  --------------------
      do 140 jjj = k , (n-1)*inc , 4*jstep
      ja = istart + jjj
*
*     "transverse" loop
*     -----------------
      do 135 nu = 1 , inq
      jb = ja + jstepl
      if (jb.lt.istart) jb = jb + ninc
      jc = jb + jstepl
      if (jc.lt.istart) jc = jc + ninc
      jd = jc + jstepl
      if (jd.lt.istart) jd = jd + ninc
      j = 0
*
*  loop across transforms
*  ----------------------
#ifdef OLD_CRAY
cdir$ ivdep, shortloop
#endif
      do 130 l = 1 , nvex
      aja = a(ja+j)
      ajc = a(jc+j)
      t0 = aja + ajc
      t2 = aja - ajc
      ajb = a(jb+j)
      ajd = a(jd+j)
      t1 = ajb + ajd
      t3 = ss * ( ajb - ajd )
      bja = b(ja+j)
      bjc = b(jc+j)
      u0 = bja + bjc
      u2 = bja - bjc
      bjb = b(jb+j)
      bjd = b(jd+j)
      u1 = bjb + bjd
      u3 = ss * ( bjb - bjd )
      a(ja+j) = t0 + t1
      b(ja+j) = u0 + u1
      a(jb+j) = co1*(t2-u3) - si1*(u2+t3)
      b(jb+j) = si1*(t2-u3) + co1*(u2+t3)
      a(jc+j) = co2*(t0-t1) - si2*(u0-u1)
      b(jc+j) = si2*(t0-t1) + co2*(u0-u1)
      a(jd+j) = co3*(t2+u3) - si3*(u2-t3)
      b(jd+j) = si3*(t2+u3) + co3*(u2-t3)
      j = j + jump
  130 continue
*-----( end of loop across transforms )
      ja = ja + jstepx
      if (ja.lt.istart) ja = ja + ninc
  135 continue
  140 continue
*-----( end of loop along transforms )
      kk = kk + 2*la
  150 continue
*-----( end of loop on nonzero k )
      la = 4*la
  160 continue
*-----( end of loop on type I radix-4 passes)
*
*  central radix-2 pass
*  --------------------
  200 continue
      if (m2.eq.0) go to 300
*
      jstep = (n*inc) / (2*la)
      jstepl = jstep - ninc
*
*  k=0 loop (no twiddle factors)
*  -----------------------------
      do 220 jjj = 0 , (n-1)*inc , 2*jstep
      ja = istart + jjj
*
*     "transverse" loop
*     -----------------
      do 215 nu = 1 , inq
      jb = ja + jstepl
      if (jb.lt.istart) jb = jb + ninc
      j = 0
*
*  loop across transforms
*  ----------------------
#ifdef OLD_CRAY
cdir$ ivdep, shortloop
#endif
      do 210 l = 1 , nvex
      aja = a(ja+j)
      ajb = a(jb+j)
      t0 = aja - ajb
      a(ja+j) = aja + ajb
      a(jb+j) = t0
      bja = b(ja+j)
      bjb = b(jb+j)
      u0 = bja - bjb
      b(ja+j) = bja + bjb
      b(jb+j) = u0
      j = j + jump
  210 continue
*-----(end of loop across transforms)
      ja = ja + jstepx
      if (ja.lt.istart) ja = ja + ninc
  215 continue
  220 continue
*
*  finished if n2=2
*  ----------------
      if (n2.eq.2) go to 490
*
      kk = 2 * la
*
*  loop on nonzero k
*  -----------------
      do 260 k = ink , jstep - ink , ink
      co1 = trigs(kk+1)
      si1 = s*trigs(kk+2)
*
*  loop along transforms
*  ---------------------
      do 250 jjj = k , (n-1)*inc , 2*jstep
      ja = istart + jjj
*
*     "transverse" loop
*     -----------------
      do 245 nu = 1 , inq
      jb = ja + jstepl
      if (jb.lt.istart) jb = jb + ninc
      j = 0
*
*  loop across transforms
*  ----------------------
      if (kk.eq.n2/2) then
#ifdef OLD_CRAY
cdir$ ivdep, shortloop
#endif
      do 230 l = 1 , nvex
      aja = a(ja+j)
      ajb = a(jb+j)
      t0 = ss * ( aja - ajb )
      a(ja+j) = aja + ajb
      bjb = b(jb+j)
      bja = b(ja+j)
      a(jb+j) = ss * ( bjb - bja )
      b(ja+j) = bja + bjb
      b(jb+j) = t0
      j = j + jump
  230 continue
*
      else
*
#ifdef OLD_CRAY
cdir$ ivdep, shortloop
#endif
      do 240 l = 1 , nvex
      aja = a(ja+j)
      ajb = a(jb+j)
      t0 = aja - ajb
      a(ja+j) = aja + ajb
      bja = b(ja+j)
      bjb = b(jb+j)
      u0 = bja - bjb
      b(ja+j) = bja + bjb
      a(jb+j) = co1*t0 - si1*u0
      b(jb+j) = si1*t0 + co1*u0
      j = j + jump
  240 continue
*
      endif
*
*-----(end of loop across transforms)
      ja = ja + jstepx
      if (ja.lt.istart) ja = ja + ninc
  245 continue
  250 continue
*-----(end of loop along transforms)
      kk = kk + 2 * la
  260 continue
*-----(end of loop on nonzero k)
*-----(end of radix-2 pass)
*
      la = 2 * la
      go to 400
*
*  central radix-8 pass
*  --------------------
  300 continue
      if (m8.eq.0) go to 400
      jstep = (n*inc) / (8*la)
      jstepl = jstep - ninc
      mu = mod(inq,8)
      if (isign.eq.-1) mu = 8 - mu
      c1 = 1.0
      if (mu.eq.3.or.mu.eq.7) c1 = -1.0
      c2 = sqrt(0.5)
      if (mu.eq.3.or.mu.eq.5) c2 = -c2
      c3 = c1 * c2
*
*  stage 1
*  -------
      do 320 k = 0 , jstep - ink , ink
      do 315 jjj = k , (n-1)*inc , 8*jstep
      ja = istart + jjj
*
*     "transverse" loop
*     -----------------
      do 312 nu = 1 , inq
      jb = ja + jstepl
      if (jb.lt.istart) jb = jb + ninc
      jc = jb + jstepl
      if (jc.lt.istart) jc = jc + ninc
      jd = jc + jstepl
      if (jd.lt.istart) jd = jd + ninc
      je = jd + jstepl
      if (je.lt.istart) je = je + ninc
      jf = je + jstepl
      if (jf.lt.istart) jf = jf + ninc
      jg = jf + jstepl
      if (jg.lt.istart) jg = jg + ninc
      jh = jg + jstepl
      if (jh.lt.istart) jh = jh + ninc
      j = 0
#ifdef OLD_CRAY
cdir$ ivdep, shortloop
#endif
      do 310 l = 1 , nvex
      aja = a(ja+j)
      aje = a(je+j)
      t0 = aja - aje
      a(ja+j) = aja + aje
      ajc = a(jc+j)
      ajg = a(jg+j)
      t1 = c1 * ( ajc - ajg )
      a(je+j) = ajc + ajg
      ajb = a(jb+j)
      ajf = a(jf+j)
      t2 = ajb - ajf
      a(jc+j) = ajb + ajf
      ajd = a(jd+j)
      ajh = a(jh+j)
      t3 = ajd - ajh
      a(jg+j) = ajd + ajh
      a(jb+j) = t0
      a(jf+j) = t1
      a(jd+j) = c2 * ( t2 - t3 )
      a(jh+j) = c3 * ( t2 + t3 )
      bja = b(ja+j)
      bje = b(je+j)
      u0 = bja - bje
      b(ja+j) = bja + bje
      bjc = b(jc+j)
      bjg = b(jg+j)
      u1 = c1 * ( bjc - bjg )
      b(je+j) = bjc + bjg
      bjb = b(jb+j)
      bjf = b(jf+j)
      u2 = bjb - bjf
      b(jc+j) = bjb + bjf
      bjd = b(jd+j)
      bjh = b(jh+j)
      u3 = bjd - bjh
      b(jg+j) = bjd + bjh
      b(jb+j) = u0
      b(jf+j) = u1
      b(jd+j) = c2 * ( u2 - u3 )
      b(jh+j) = c3 * ( u2 + u3 )
      j = j + jump
  310 continue
      ja = ja + jstepx
      if (ja.lt.istart) ja = ja + ninc
  312 continue
  315 continue
  320 continue
*
*  stage 2
*  -------
*
*  k=0 (no twiddle factors)
*  ------------------------
      do 330 jjj = 0 , (n-1)*inc , 8*jstep
      ja = istart + jjj
*
*     "transverse" loop
*     -----------------
      do 328 nu = 1 , inq
      jb = ja + jstepl
      if (jb.lt.istart) jb = jb + ninc
      jc = jb + jstepl
      if (jc.lt.istart) jc = jc + ninc
      jd = jc + jstepl
      if (jd.lt.istart) jd = jd + ninc
      je = jd + jstepl
      if (je.lt.istart) je = je + ninc
      jf = je + jstepl
      if (jf.lt.istart) jf = jf + ninc
      jg = jf + jstepl
      if (jg.lt.istart) jg = jg + ninc
      jh = jg + jstepl
      if (jh.lt.istart) jh = jh + ninc
      j = 0
#ifdef OLD_CRAY
cdir$ ivdep, shortloop
#endif
      do 325 l = 1 , nvex
      aja = a(ja+j)
      aje = a(je+j)
      t0 = aja + aje
      t2 = aja - aje
      ajc = a(jc+j)
      ajg = a(jg+j)
      t1 = ajc + ajg
      t3 = c1 * ( ajc - ajg )
      bja = b(ja+j)
      bje = b(je+j)
      u0 = bja + bje
      u2 = bja - bje
      bjc = b(jc+j)
      bjg = b(jg+j)
      u1 = bjc + bjg
      u3 = c1 * ( bjc - bjg )
      a(ja+j) = t0 + t1
      a(je+j) = t0 - t1
      b(ja+j) = u0 + u1
      b(je+j) = u0 - u1
      a(jc+j) = t2 - u3
      a(jg+j) = t2 + u3
      b(jc+j) = u2 + t3
      b(jg+j) = u2 - t3
      ajb = a(jb+j)
      ajd = a(jd+j)
      t0 = ajb + ajd
      t2 = ajb - ajd
      ajf = a(jf+j)
      ajh = a(jh+j)
      t1 = ajf - ajh
      t3 = ajf + ajh
      bjb = b(jb+j)
      bjd = b(jd+j)
      u0 = bjb + bjd
      u2 = bjb - bjd
      bjf = b(jf+j)
      bjh = b(jh+j)
      u1 = bjf - bjh
      u3 = bjf + bjh
      a(jb+j) = t0 - u3
      a(jh+j) = t0 + u3
      b(jb+j) = u0 + t3
      b(jh+j) = u0 - t3
      a(jd+j) = t2 + u1
      a(jf+j) = t2 - u1
      b(jd+j) = u2 - t1
      b(jf+j) = u2 + t1
      j = j + jump
  325 continue
      ja = ja + jstepx
      if (ja.lt.istart) ja = ja + ninc
  328 continue
  330 continue
*
      if (n2.eq.8) go to 490
*
*  loop on nonzero k
*  -----------------
      kk = 2 * la
*
      do 350 k = ink , jstep - ink , ink
*
      co1 = trigs(kk+1)
      si1 = s * trigs(kk+2)
      co2 = trigs(2*kk+1)
      si2 = s * trigs(2*kk+2)
      co3 = trigs(3*kk+1)
      si3 = s * trigs(3*kk+2)
      co4 = trigs(4*kk+1)
      si4 = s * trigs(4*kk+2)
      co5 = trigs(5*kk+1)
      si5 = s * trigs(5*kk+2)
      co6 = trigs(6*kk+1)
      si6 = s * trigs(6*kk+2)
      co7 = trigs(7*kk+1)
      si7 = s * trigs(7*kk+2)
*
      do 345 jjj = k , (n-1)*inc , 8*jstep
      ja = istart + jjj
*
*     "transverse" loop
*     -----------------
      do 342 nu = 1 , inq
      jb = ja + jstepl
      if (jb.lt.istart) jb = jb + ninc
      jc = jb + jstepl
      if (jc.lt.istart) jc = jc + ninc
      jd = jc + jstepl
      if (jd.lt.istart) jd = jd + ninc
      je = jd + jstepl
      if (je.lt.istart) je = je + ninc
      jf = je + jstepl
      if (jf.lt.istart) jf = jf + ninc
      jg = jf + jstepl
      if (jg.lt.istart) jg = jg + ninc
      jh = jg + jstepl
      if (jh.lt.istart) jh = jh + ninc
      j = 0
#ifdef OLD_CRAY
cdir$ ivdep, shortloop
#endif
      do 340 l = 1 , nvex
      aja = a(ja+j)
      aje = a(je+j)
      t0 = aja + aje
      t2 = aja - aje
      ajc = a(jc+j)
      ajg = a(jg+j)
      t1 = ajc + ajg
      t3 = c1 * ( ajc - ajg )
      bja = b(ja+j)
      bje = b(je+j)
      u0 = bja + bje
      u2 = bja - bje
      bjc = b(jc+j)
      bjg = b(jg+j)
      u1 = bjc + bjg
      u3 = c1 * ( bjc - bjg )
      a(ja+j) = t0 + t1
      b(ja+j) = u0 + u1
      a(je+j) = co4*(t0-t1) - si4*(u0-u1)
      b(je+j) = si4*(t0-t1) + co4*(u0-u1)
      a(jc+j) = co2*(t2-u3) - si2*(u2+t3)
      b(jc+j) = si2*(t2-u3) + co2*(u2+t3)
      a(jg+j) = co6*(t2+u3) - si6*(u2-t3)
      b(jg+j) = si6*(t2+u3) + co6*(u2-t3)
      ajb = a(jb+j)
      ajd = a(jd+j)
      t0 = ajb + ajd
      t2 = ajb - ajd
      ajf = a(jf+j)
      ajh = a(jh+j)
      t1 = ajf - ajh
      t3 = ajf + ajh
      bjb = b(jb+j)
      bjd = b(jd+j)
      u0 = bjb + bjd
      u2 = bjb - bjd
      bjf = b(jf+j)
      bjh = b(jh+j)
      u1 = bjf - bjh
      u3 = bjf + bjh
      a(jb+j) = co1*(t0-u3) - si1*(u0+t3)
      b(jb+j) = si1*(t0-u3) + co1*(u0+t3)
      a(jh+j) = co7*(t0+u3) - si7*(u0-t3)
      b(jh+j) = si7*(t0+u3) + co7*(u0-t3)
      a(jd+j) = co3*(t2+u1) - si3*(u2-t1)
      b(jd+j) = si3*(t2+u1) + co3*(u2-t1)
      a(jf+j) = co5*(t2-u1) - si5*(u2+t1)
      b(jf+j) = si5*(t2-u1) + co5*(u2+t1)
      j = j + jump
  340 continue
      ja = ja + jstepx
      if (ja.lt.istart) ja = ja + ninc
  342 continue
  345 continue
      kk = kk + 2 * la
  350 continue
*
      la = 8 * la
*
*  loop on type II radix-4 passes
*  ------------------------------
  400 continue
      mu = mod(inq,4)
      if (isign.eq.-1) mu = 4 - mu
      ss = 1.0
      if (mu.eq.3) ss = -1.0
*
      do 480 ipass = mh+1 , m
      jstep = (n*inc) / (4*la)
      jstepl = jstep - ninc
      laincl = la * ink - ninc
*
*  k=0 loop (no twiddle factors)
*  -----------------------------
      do 430 ll = 0 , (la-1)*ink , 4*jstep
*
      do 420 jjj = ll , (n-1)*inc , 4*la*ink
      ja = istart + jjj
*
*     "transverse" loop
*     -----------------
      do 415 nu = 1 , inq
      jb = ja + jstepl
      if (jb.lt.istart) jb = jb + ninc
      jc = jb + jstepl
      if (jc.lt.istart) jc = jc + ninc
      jd = jc + jstepl
      if (jd.lt.istart) jd = jd + ninc
      je = ja + laincl
      if (je.lt.istart) je = je + ninc
      jf = je + jstepl
      if (jf.lt.istart) jf = jf + ninc
      jg = jf + jstepl
      if (jg.lt.istart) jg = jg + ninc
      jh = jg + jstepl
      if (jh.lt.istart) jh = jh + ninc
      ji = je + laincl
      if (ji.lt.istart) ji = ji + ninc
      jj = ji + jstepl
      if (jj.lt.istart) jj = jj + ninc
      jk = jj + jstepl
      if (jk.lt.istart) jk = jk + ninc
      jl = jk + jstepl
      if (jl.lt.istart) jl = jl + ninc
      jm = ji + laincl
      if (jm.lt.istart) jm = jm + ninc
      jn = jm + jstepl
      if (jn.lt.istart) jn = jn + ninc
      jo = jn + jstepl
      if (jo.lt.istart) jo = jo + ninc
      jp = jo + jstepl
      if (jp.lt.istart) jp = jp + ninc
      j = 0
*
*  loop across transforms
*  ----------------------
#ifdef OLD_CRAY
cdir$ ivdep, shortloop
#endif
      do 410 l = 1 , nvex
      aja = a(ja+j)
      ajc = a(jc+j)
      t0 = aja + ajc
      t2 = aja - ajc
      ajb = a(jb+j)
      ajd = a(jd+j)
      t1 = ajb + ajd
      t3 = ss * ( ajb - ajd )
      aji = a(ji+j)
      ajc =  aji
      bja = b(ja+j)
      bjc = b(jc+j)
      u0 = bja + bjc
      u2 = bja - bjc
      bjb = b(jb+j)
      bjd = b(jd+j)
      u1 = bjb + bjd
      u3 = ss * ( bjb - bjd )
      aje = a(je+j)
      ajb =  aje
      a(ja+j) = t0 + t1
      a(ji+j) = t0 - t1
      b(ja+j) = u0 + u1
      bjc =  u0 - u1
      bjm = b(jm+j)
      bjd =  bjm
      a(je+j) = t2 - u3
      ajd =  t2 + u3
      bjb =  u2 + t3
      b(jm+j) = u2 - t3
*----------------------
      ajg = a(jg+j)
      t0 = ajb + ajg
      t2 = ajb - ajg
      ajf = a(jf+j)
      ajh = a(jh+j)
      t1 = ajf + ajh
      t3 = ss * ( ajf - ajh )
      ajj = a(jj+j)
      ajg =  ajj
      bje = b(je+j)
      bjg = b(jg+j)
      u0 = bje + bjg
      u2 = bje - bjg
      bjf = b(jf+j)
      bjh = b(jh+j)
      u1 = bjf + bjh
      u3 = ss * ( bjf - bjh )
      b(je+j) = bjb
      a(jb+j) = t0 + t1
      a(jj+j) = t0 - t1
      bjj = b(jj+j)
      bjg =  bjj
      b(jb+j) = u0 + u1
      b(jj+j) = u0 - u1
      a(jf+j) = t2 - u3
      ajh =  t2 + u3
      b(jf+j) = u2 + t3
      bjh =  u2 - t3
*----------------------
      ajk = a(jk+j)
      t0 = ajc + ajk
      t2 = ajc - ajk
      ajl = a(jl+j)
      t1 = ajg + ajl
      t3 = ss * ( ajg - ajl )
      bji = b(ji+j)
      bjk = b(jk+j)
      u0 = bji + bjk
      u2 = bji - bjk
      ajo = a(jo+j)
      ajl =  ajo
      bjl = b(jl+j)
      u1 = bjg + bjl
      u3 = ss * ( bjg - bjl )
      b(ji+j) = bjc
      a(jc+j) = t0 + t1
      a(jk+j) = t0 - t1
      bjo = b(jo+j)
      bjl =  bjo
      b(jc+j) = u0 + u1
      b(jk+j) = u0 - u1
      a(jg+j) = t2 - u3
      a(jo+j) = t2 + u3
      b(jg+j) = u2 + t3
      b(jo+j) = u2 - t3
*----------------------
      ajm = a(jm+j)
      t0 = ajm + ajl
      t2 = ajm - ajl
      ajn = a(jn+j)
      ajp = a(jp+j)
      t1 = ajn + ajp
      t3 = ss * ( ajn - ajp )
      a(jm+j) = ajd
      u0 = bjd + bjl
      u2 = bjd - bjl
      bjn = b(jn+j)
      bjp = b(jp+j)
      u1 = bjn + bjp
      u3 = ss * ( bjn - bjp )
      a(jn+j) = ajh
      a(jd+j) = t0 + t1
      a(jl+j) = t0 - t1
      b(jd+j) = u0 + u1
      b(jl+j) = u0 - u1
      b(jn+j) = bjh
      a(jh+j) = t2 - u3
      a(jp+j) = t2 + u3
      b(jh+j) = u2 + t3
      b(jp+j) = u2 - t3
      j = j + jump
  410 continue
*-----( end of loop across transforms )
      ja = ja + jstepx
      if (ja.lt.istart) ja = ja + ninc
  415 continue
  420 continue
  430 continue
*-----( end of double loop for k=0 )
*
*  finished if last pass
*  ---------------------
      if (ipass.eq.m) go to 490
*
      kk = 2*la
*
*     loop on nonzero k
*     -----------------
      do 470 k = ink , jstep-ink , ink
      co1 = trigs(kk+1)
      si1 = s*trigs(kk+2)
      co2 = trigs(2*kk+1)
      si2 = s*trigs(2*kk+2)
      co3 = trigs(3*kk+1)
      si3 = s*trigs(3*kk+2)
*
*  double loop along first transform in block
*  ------------------------------------------
      do 460 ll = k , (la-1)*ink , 4*jstep
*
      do 450 jjj = ll , (n-1)*inc , 4*la*ink
      ja = istart + jjj
*
*     "transverse" loop
*     -----------------
      do 445 nu = 1 , inq
      jb = ja + jstepl
      if (jb.lt.istart) jb = jb + ninc
      jc = jb + jstepl
      if (jc.lt.istart) jc = jc + ninc
      jd = jc + jstepl
      if (jd.lt.istart) jd = jd + ninc
      je = ja + laincl
      if (je.lt.istart) je = je + ninc
      jf = je + jstepl
      if (jf.lt.istart) jf = jf + ninc
      jg = jf + jstepl
      if (jg.lt.istart) jg = jg + ninc
      jh = jg + jstepl
      if (jh.lt.istart) jh = jh + ninc
      ji = je + laincl
      if (ji.lt.istart) ji = ji + ninc
      jj = ji + jstepl
      if (jj.lt.istart) jj = jj + ninc
      jk = jj + jstepl
      if (jk.lt.istart) jk = jk + ninc
      jl = jk + jstepl
      if (jl.lt.istart) jl = jl + ninc
      jm = ji + laincl
      if (jm.lt.istart) jm = jm + ninc
      jn = jm + jstepl
      if (jn.lt.istart) jn = jn + ninc
      jo = jn + jstepl
      if (jo.lt.istart) jo = jo + ninc
      jp = jo + jstepl
      if (jp.lt.istart) jp = jp + ninc
      j = 0
*
*  loop across transforms
*  ----------------------
#ifdef OLD_CRAY
cdir$ ivdep, shortloop
#endif
      do 440 l = 1 , nvex
      aja = a(ja+j)
      ajc = a(jc+j)
      t0 = aja + ajc
      t2 = aja - ajc
      ajb = a(jb+j)
      ajd = a(jd+j)
      t1 = ajb + ajd
      t3 = ss * ( ajb - ajd )
      aji = a(ji+j)
      ajc =  aji
      bja = b(ja+j)
      bjc = b(jc+j)
      u0 = bja + bjc
      u2 = bja - bjc
      bjb = b(jb+j)
      bjd = b(jd+j)
      u1 = bjb + bjd
      u3 = ss * ( bjb - bjd )
      aje = a(je+j)
      ajb =  aje
      a(ja+j) = t0 + t1
      b(ja+j) = u0 + u1
      a(je+j) = co1*(t2-u3) - si1*(u2+t3)
      bjb =  si1*(t2-u3) + co1*(u2+t3)
      bjm = b(jm+j)
      bjd =  bjm
      a(ji+j) = co2*(t0-t1) - si2*(u0-u1)
      bjc =  si2*(t0-t1) + co2*(u0-u1)
      ajd =  co3*(t2+u3) - si3*(u2-t3)
      b(jm+j) = si3*(t2+u3) + co3*(u2-t3)
*----------------------------------------
      ajg = a(jg+j)
      t0 = ajb + ajg
      t2 = ajb - ajg
      ajf = a(jf+j)
      ajh = a(jh+j)
      t1 = ajf + ajh
      t3 = ss * ( ajf - ajh )
      ajj = a(jj+j)
      ajg =  ajj
      bje = b(je+j)
      bjg = b(jg+j)
      u0 = bje + bjg
      u2 = bje - bjg
      bjf = b(jf+j)
      bjh = b(jh+j)
      u1 = bjf + bjh
      u3 = ss * ( bjf - bjh )
      b(je+j) = bjb
      a(jb+j) = t0 + t1
      b(jb+j) = u0 + u1
      bjj = b(jj+j)
      bjg =  bjj
      a(jf+j) = co1*(t2-u3) - si1*(u2+t3)
      b(jf+j) = si1*(t2-u3) + co1*(u2+t3)
      a(jj+j) = co2*(t0-t1) - si2*(u0-u1)
      b(jj+j) = si2*(t0-t1) + co2*(u0-u1)
      ajh =  co3*(t2+u3) - si3*(u2-t3)
      bjh =  si3*(t2+u3) + co3*(u2-t3)
*----------------------------------------
      ajk = a(jk+j)
      t0 = ajc + ajk
      t2 = ajc - ajk
      ajl = a(jl+j)
      t1 = ajg + ajl
      t3 = ss * ( ajg - ajl )
      bji = b(ji+j)
      bjk = b(jk+j)
      u0 = bji + bjk
      u2 = bji - bjk
      ajo = a(jo+j)
      ajl =  ajo
      bjl = b(jl+j)
      u1 = bjg + bjl
      u3 = ss * ( bjg - bjl )
      b(ji+j) = bjc
      a(jc+j) = t0 + t1
      b(jc+j) = u0 + u1
      bjo = b(jo+j)
      bjl =  bjo
      a(jg+j) = co1*(t2-u3) - si1*(u2+t3)
      b(jg+j) = si1*(t2-u3) + co1*(u2+t3)
      a(jk+j) = co2*(t0-t1) - si2*(u0-u1)
      b(jk+j) = si2*(t0-t1) + co2*(u0-u1)
      a(jo+j) = co3*(t2+u3) - si3*(u2-t3)
      b(jo+j) = si3*(t2+u3) + co3*(u2-t3)
*----------------------------------------
      ajm = a(jm+j)
      t0 = ajm + ajl
      t2 = ajm - ajl
      ajn = a(jn+j)
      ajp = a(jp+j)
      t1 = ajn + ajp
      t3 = ss * ( ajn - ajp )
      a(jm+j) = ajd
      u0 = bjd + bjl
      u2 = bjd - bjl
      a(jn+j) = ajh
      bjn = b(jn+j)
      bjp = b(jp+j)
      u1 = bjn + bjp
      u3 = ss * ( bjn - bjp )
      b(jn+j) = bjh
      a(jd+j) = t0 + t1
      b(jd+j) = u0 + u1
      a(jh+j) = co1*(t2-u3) - si1*(u2+t3)
      b(jh+j) = si1*(t2-u3) + co1*(u2+t3)
      a(jl+j) = co2*(t0-t1) - si2*(u0-u1)
      b(jl+j) = si2*(t0-t1) + co2*(u0-u1)
      a(jp+j) = co3*(t2+u3) - si3*(u2-t3)
      b(jp+j) = si3*(t2+u3) + co3*(u2-t3)
      j = j + jump
  440 continue
*-----(end of loop across transforms)
      ja = ja + jstepx
      if (ja.lt.istart) ja = ja + ninc
  445 continue
  450 continue
  460 continue
*-----( end of double loop for this k )
      kk = kk + 2*la
  470 continue
*-----( end of loop over values of k )
      la = 4*la
  480 continue
*-----( end of loop on type II radix-4 passes )
*-----( nvex transforms completed)
  490 continue
      istart = istart + nvex * jump
  500 continue
*-----( end of loop on blocks of transforms )
*
      return
      end subroutine gpfa2f

!******************************************************************

*     fortran version of *gpfa3* -
*     radix-3 section of self-sorting, in-place
*        generalized PFA
*
*-------------------------------------------------------------------
*
      subroutine gpfa3f(a,b,trigs,inc,jump,n,mm,lot,isign)

      real(grid_p) ::     a(*), b(*)
      real(dp)     ::     trigs(*)
      integer inc, jump, n, mm, lot, isign

      real(dp), parameter :: sin60 = 0.866025403784437_dp

      integer      :: je, jf, jg, jh, laincl, ll, ji
      integer      :: n3, inq, jstepx, ninc, ink, mu, m, mh,
     $                nblox, left, istart, nb, nvex, la,
     $                ipass, jstep, jstepl, jjj, ja, nu, jb,
     $                jc, j, l, kk, k, jd

      real(dp) :: s, ajb, ajc, t1, aja, t2, t3, bjb, bjc,
     $                u1, bja, u2, u3, co1, si1, co2, si2, ajd,
     $                bjd
      real(dp) :: c1, aje, ajg, ajf, ajh, bje,
     $                bjf, bjg, bjh, aji, bji


      integer, parameter :: lvr = 128
*
*     ***************************************************************
*     *                                                             *
*     *  N.B. LVR = LENGTH OF VECTOR REGISTERS, SET TO 128 FOR C90. *
*     *  RESET TO 64 FOR OTHER CRAY MACHINES, OR TO ANY LARGE VALUE *
*     *  (GREATER THAN OR EQUAL TO LOT) FOR A SCALAR COMPUTER.      *
*     *                                                             *
*     ***************************************************************
*
      n3 = 3**mm
      inq = n/n3
      jstepx = (n3-n) * inc
      ninc = n * inc
      ink = inc * inq
      mu = mod(inq,3)
      if (isign.eq.-1) mu = 3-mu
      m = mm
      mh = (m+1)/2
      s = float(isign)
      c1 = sin60
      if (mu.eq.2) c1 = -c1
*
      nblox = 1 + (lot-1)/lvr
      left = lot
      s = float(isign)
      istart = 1
*
*  loop on blocks of lvr transforms
*  --------------------------------
      do 500 nb = 1 , nblox
*
      if (left.le.lvr) then
         nvex = left
      else if (left.lt.(2*lvr)) then
         nvex = left/2
         nvex = nvex + mod(nvex,2)
      else
         nvex = lvr
      endif
      left = left - nvex
*
      la = 1
*
*  loop on type I radix-3 passes
*  -----------------------------
      do 160 ipass = 1 , mh
      jstep = (n*inc) / (3*la)
      jstepl = jstep - ninc
*
*  k = 0 loop (no twiddle factors)
*  -------------------------------
      do 120 jjj = 0 , (n-1)*inc , 3*jstep
      ja = istart + jjj
*
*  "transverse" loop
*  -----------------
      do 115 nu = 1 , inq
      jb = ja + jstepl
      if (jb.lt.istart) jb = jb + ninc
      jc = jb + jstepl
      if (jc.lt.istart) jc = jc + ninc
      j = 0
*
*  loop across transforms
*  ----------------------
#ifdef OLD_CRAY
cdir$ ivdep, shortloop
#endif
      do 110 l = 1 , nvex
      ajb = a(jb+j)
      ajc = a(jc+j)
      t1 = ajb + ajc
      aja = a(ja+j)
      t2 = aja - 0.5 * t1
      t3 = c1 * ( ajb - ajc )
      bjb = b(jb+j)
      bjc = b(jc+j)
      u1 = bjb + bjc
      bja = b(ja+j)
      u2 = bja - 0.5 * u1
      u3 = c1 * ( bjb - bjc )
      a(ja+j) = aja + t1
      b(ja+j) = bja + u1
      a(jb+j) = t2 - u3
      b(jb+j) = u2 + t3
      a(jc+j) = t2 + u3
      b(jc+j) = u2 - t3
      j = j + jump
  110 continue
      ja = ja + jstepx
      if (ja.lt.istart) ja = ja + ninc
  115 continue
  120 continue
*
*  finished if n3 = 3
*  ------------------
      if (n3.eq.3) go to 490
      kk = 2 * la
*
*  loop on nonzero k
*  -----------------
      do 150 k = ink , jstep-ink , ink
      co1 = trigs(kk+1)
      si1 = s*trigs(kk+2)
      co2 = trigs(2*kk+1)
      si2 = s*trigs(2*kk+2)
*
*  loop along transform
*  --------------------
      do 140 jjj = k , (n-1)*inc , 3*jstep
      ja = istart + jjj
*
*  "transverse" loop
*  -----------------
      do 135 nu = 1 , inq
      jb = ja + jstepl
      if (jb.lt.istart) jb = jb + ninc
      jc = jb + jstepl
      if (jc.lt.istart) jc = jc + ninc
      j = 0
*
*  loop across transforms
*  ----------------------
#ifdef OLD_CRAY
cdir$ ivdep, shortloop
#endif
      do 130 l = 1 , nvex
      ajb = a(jb+j)
      ajc = a(jc+j)
      t1 = ajb + ajc
      aja = a(ja+j)
      t2 = aja - 0.5 * t1
      t3 = c1 * ( ajb - ajc )
      bjb = b(jb+j)
      bjc = b(jc+j)
      u1 = bjb + bjc
      bja = b(ja+j)
      u2 = bja - 0.5 * u1
      u3 = c1 * ( bjb - bjc )
      a(ja+j) = aja + t1
      b(ja+j) = bja + u1
      a(jb+j) = co1*(t2-u3) - si1*(u2+t3)
      b(jb+j) = si1*(t2-u3) + co1*(u2+t3)
      a(jc+j) = co2*(t2+u3) - si2*(u2-t3)
      b(jc+j) = si2*(t2+u3) + co2*(u2-t3)
      j = j + jump
  130 continue
*-----( end of loop across transforms )
      ja = ja + jstepx
      if (ja.lt.istart) ja = ja + ninc
  135 continue
  140 continue
*-----( end of loop along transforms )
      kk = kk + 2*la
  150 continue
*-----( end of loop on nonzero k )
      la = 3*la
  160 continue
*-----( end of loop on type I radix-3 passes)
*
*  loop on type II radix-3 passes
*  ------------------------------
  400 continue
*
      do 480 ipass = mh+1 , m
      jstep = (n*inc) / (3*la)
      jstepl = jstep - ninc
      laincl = la*ink - ninc
*
*  k=0 loop (no twiddle factors)
*  -----------------------------
      do 430 ll = 0 , (la-1)*ink , 3*jstep
*
      do 420 jjj = ll , (n-1)*inc , 3*la*ink
      ja = istart + jjj
*
*  "transverse" loop
*  -----------------
      do 415 nu = 1 , inq
      jb = ja + jstepl
      if (jb.lt.istart) jb = jb + ninc
      jc = jb + jstepl
      if (jc.lt.istart) jc = jc + ninc
      jd = ja + laincl
      if (jd.lt.istart) jd = jd + ninc
      je = jd + jstepl
      if (je.lt.istart) je = je + ninc
      jf = je + jstepl
      if (jf.lt.istart) jf = jf + ninc
      jg = jd + laincl
      if (jg.lt.istart) jg = jg + ninc
      jh = jg + jstepl
      if (jh.lt.istart) jh = jh + ninc
      ji = jh + jstepl
      if (ji.lt.istart) ji = ji + ninc
      j = 0
*
*  loop across transforms
*  ----------------------
#ifdef OLD_CRAY
cdir$ ivdep, shortloop
#endif
      do 410 l = 1 , nvex
      ajb = a(jb+j)
      ajc = a(jc+j)
      t1 = ajb + ajc
      aja = a(ja+j)
      t2 = aja - 0.5 * t1
      t3 = c1 * ( ajb - ajc )
      ajd = a(jd+j)
      ajb =  ajd
      bjb = b(jb+j)
      bjc = b(jc+j)
      u1 = bjb + bjc
      bja = b(ja+j)
      u2 = bja - 0.5 * u1
      u3 = c1 * ( bjb - bjc )
      bjd = b(jd+j)
      bjb =  bjd
      a(ja+j) = aja + t1
      b(ja+j) = bja + u1
      a(jd+j) = t2 - u3
      b(jd+j) = u2 + t3
      ajc =  t2 + u3
      bjc =  u2 - t3
*----------------------
      aje = a(je+j)
      ajf = a(jf+j)
      t1 = aje + ajf
      t2 = ajb - 0.5 * t1
      t3 = c1 * ( aje - ajf )
      ajh = a(jh+j)
      ajf =  ajh
      bje = b(je+j)
      bjf = b(jf+j)
      u1 = bje + bjf
      u2 = bjb - 0.5 * u1
      u3 = c1 * ( bje - bjf )
      bjh = b(jh+j)
      bjf =  bjh
      a(jb+j) = ajb + t1
      b(jb+j) = bjb + u1
      a(je+j) = t2 - u3
      b(je+j) = u2 + t3
      a(jh+j) = t2 + u3
      b(jh+j) = u2 - t3
*----------------------
      aji = a(ji+j)
      t1 = ajf + aji
      ajg = a(jg+j)
      t2 = ajg - 0.5 * t1
      t3 = c1 * ( ajf - aji )
      t1 = ajg + t1
      a(jg+j) = ajc
      bji = b(ji+j)
      u1 = bjf + bji
      bjg = b(jg+j)
      u2 = bjg - 0.5 * u1
      u3 = c1 * ( bjf - bji )
      u1 = bjg + u1
      b(jg+j) = bjc
      a(jc+j) = t1
      b(jc+j) = u1
      a(jf+j) = t2 - u3
      b(jf+j) = u2 + t3
      a(ji+j) = t2 + u3
      b(ji+j) = u2 - t3
      j = j + jump
  410 continue
*-----( end of loop across transforms )
      ja = ja + jstepx
      if (ja.lt.istart) ja = ja + ninc
  415 continue
  420 continue
  430 continue
*-----( end of double loop for k=0 )
*
*  finished if last pass
*  ---------------------
      if (ipass.eq.m) go to 490
*
      kk = 2*la
*
*     loop on nonzero k
*     -----------------
      do 470 k = ink , jstep-ink , ink
      co1 = trigs(kk+1)
      si1 = s*trigs(kk+2)
      co2 = trigs(2*kk+1)
      si2 = s*trigs(2*kk+2)
*
*  double loop along first transform in block
*  ------------------------------------------
      do 460 ll = k , (la-1)*ink , 3*jstep
*
      do 450 jjj = ll , (n-1)*inc , 3*la*ink
      ja = istart + jjj
*
*  "transverse" loop
*  -----------------
      do 445 nu = 1 , inq
      jb = ja + jstepl
      if (jb.lt.istart) jb = jb + ninc
      jc = jb + jstepl
      if (jc.lt.istart) jc = jc + ninc
      jd = ja + laincl
      if (jd.lt.istart) jd = jd + ninc
      je = jd + jstepl
      if (je.lt.istart) je = je + ninc
      jf = je + jstepl
      if (jf.lt.istart) jf = jf + ninc
      jg = jd + laincl
      if (jg.lt.istart) jg = jg + ninc
      jh = jg + jstepl
      if (jh.lt.istart) jh = jh + ninc
      ji = jh + jstepl
      if (ji.lt.istart) ji = ji + ninc
      j = 0
*
*  loop across transforms
*  ----------------------
#ifdef OLD_CRAY
cdir$ ivdep, shortloop
#endif
      do 440 l = 1 , nvex
      ajb = a(jb+j)
      ajc = a(jc+j)
      t1 = ajb + ajc
      aja = a(ja+j)
      t2 = aja - 0.5 * t1
      t3 = c1 * ( ajb - ajc )
      ajd = a(jd+j)
      ajb =  ajd
      bjb = b(jb+j)
      bjc = b(jc+j)
      u1 = bjb + bjc
      bja = b(ja+j)
      u2 = bja - 0.5 * u1
      u3 = c1 * ( bjb - bjc )
      bjd = b(jd+j)
      bjb =  bjd
      a(ja+j) = aja + t1
      b(ja+j) = bja + u1
      a(jd+j) = co1*(t2-u3) - si1*(u2+t3)
      b(jd+j) = si1*(t2-u3) + co1*(u2+t3)
      ajc =  co2*(t2+u3) - si2*(u2-t3)
      bjc =  si2*(t2+u3) + co2*(u2-t3)
*----------------------
      aje = a(je+j)
      ajf = a(jf+j)
      t1 = aje + ajf
      t2 = ajb - 0.5 * t1
      t3 = c1 * ( aje - ajf )
      ajh = a(jh+j)
      ajf =  ajh
      bje = b(je+j)
      bjf = b(jf+j)
      u1 = bje + bjf
      u2 = bjb - 0.5 * u1
      u3 = c1 * ( bje - bjf )
      bjh = b(jh+j)
      bjf =  bjh
      a(jb+j) = ajb + t1
      b(jb+j) = bjb + u1
      a(je+j) = co1*(t2-u3) - si1*(u2+t3)
      b(je+j) = si1*(t2-u3) + co1*(u2+t3)
      a(jh+j) = co2*(t2+u3) - si2*(u2-t3)
      b(jh+j) = si2*(t2+u3) + co2*(u2-t3)
*----------------------
      aji = a(ji+j)
      t1 = ajf + aji
      ajg = a(jg+j)
      t2 = ajg - 0.5 * t1
      t3 = c1 * ( ajf - aji )
      t1 = ajg + t1
      a(jg+j) = ajc
      bji = b(ji+j)
      u1 = bjf + bji
      bjg = b(jg+j)
      u2 = bjg - 0.5 * u1
      u3 = c1 * ( bjf - bji )
      u1 = bjg + u1
      b(jg+j) = bjc
      a(jc+j) = t1
      b(jc+j) = u1
      a(jf+j) = co1*(t2-u3) - si1*(u2+t3)
      b(jf+j) = si1*(t2-u3) + co1*(u2+t3)
      a(ji+j) = co2*(t2+u3) - si2*(u2-t3)
      b(ji+j) = si2*(t2+u3) + co2*(u2-t3)
      j = j + jump
  440 continue
*-----(end of loop across transforms)
      ja = ja + jstepx
      if (ja.lt.istart) ja = ja + ninc
  445 continue
  450 continue
  460 continue
*-----( end of double loop for this k )
      kk = kk + 2*la
  470 continue
*-----( end of loop over values of k )
      la = 3*la
  480 continue
*-----( end of loop on type II radix-3 passes )
*-----( nvex transforms completed)
  490 continue
      istart = istart + nvex * jump
  500 continue
*-----( end of loop on blocks of transforms )
*
      return
      end subroutine gpfa3f

!*******************************************************************

*     fortran version of *gpfa5* -
*     radix-5 section of self-sorting, in-place,
*        generalized pfa
*
*-------------------------------------------------------------------
*
      subroutine gpfa5f(a,b,trigs,inc,jump,n,mm,lot,isign)
      use precision, only : dp, grid_p

      real(grid_p) ::     a(*), b(*)
      real(dp)     ::     trigs(*)
      integer inc, jump, n, mm, lot, isign

      real(dp), parameter :: sin36 = 0.587785252292473_dp,
     $                       sin72 = 0.951056516295154_dp,
     $                       qrt5  = 0.559016994374947_dp
      integer, parameter  :: lvr = 128

      integer      ::  inq, jstepx, ninc, ink, mu, m, mh,
     $                 nblox, n5, left, istart, nb, nvex, la,
     $                 ipass, jstep, jstepl, kk, k, jjj, ja,
     $                 nu, jb, jc, jd, je, j, laincl, ll, l,
     $                 jf, jg, jh, ji, jj, jk, jl, jm, jn, jo,
     $                 jp, jq, jr, js, jt, ju, jv, jw, jx, jy

      real(dp) ::  s, c1, c2, c3, co1, si1, co2, si2, co3,
     $                 si3, co4, si4, ajb, aje, t1, ajc, ajd,
     $                 t2, t3, t4, t5, t6, aja, t7, t8, t9, t10,
     $                 t11, bjb, bje, u1, bjc, bjd, u2, u3, u4,
     $                 u5, u6, bja, u7, u8, u9, u10, u11, ajf,
     $                 ajk, bjf, bjk, ajg, ajj, ajh, aji, ajl,
     $                 ajq, bjg, bjj, bjh, bji, bjl, bjq, ajo,
     $                 ajm, ajn, ajr, ajw, bjo, bjm, bjn, bjr,
     $                 bjw, ajt, ajs, ajx, ajp, ax, bjt, bjs,
     $                 bjx, bjp, bx, ajv, ajy, aju, bjv, bjy,
     $                 bju
*
*     ***************************************************************
*     *                                                             *
*     *  N.B. LVR = LENGTH OF VECTOR REGISTERS, SET TO 128 FOR C90. *
*     *  RESET TO 64 FOR OTHER CRAY MACHINES, OR TO ANY LARGE VALUE *
*     *  (GREATER THAN OR EQUAL TO LOT) FOR A SCALAR COMPUTER.      *
*     *                                                             *
*     ***************************************************************
*
      n5 = 5 ** mm
      inq = n / n5
      jstepx = (n5-n) * inc
      ninc = n * inc
      ink = inc * inq
      mu = mod(inq,5)
      if (isign.eq.-1) mu = 5 - mu
*
      m = mm
      mh = (m+1)/2
      s = float(isign)
      c1 = qrt5
      c2 = sin72
      c3 = sin36
      if (mu.eq.2.or.mu.eq.3) then
         c1 = -c1
         c2 = sin36
         c3 = sin72
      endif
      if (mu.eq.3.or.mu.eq.4) c2 = -c2
      if (mu.eq.2.or.mu.eq.4) c3 = -c3
*
      nblox = 1 + (lot-1)/lvr
      left = lot
      s = float(isign)
      istart = 1
*
*  loop on blocks of lvr transforms
*  --------------------------------
      do 500 nb = 1 , nblox
*
      if (left.le.lvr) then
         nvex = left
      else if (left.lt.(2*lvr)) then
         nvex = left/2
         nvex = nvex + mod(nvex,2)
      else
         nvex = lvr
      endif
      left = left - nvex
*
      la = 1
*
*  loop on type I radix-5 passes
*  -----------------------------
      do 160 ipass = 1 , mh
      jstep = (n*inc) / (5*la)
      jstepl = jstep - ninc
      kk = 0
*
*  loop on k
*  ---------
      do 150 k = 0 , jstep-ink , ink
*
      if (k.gt.0) then
      co1 = trigs(kk+1)
      si1 = s*trigs(kk+2)
      co2 = trigs(2*kk+1)
      si2 = s*trigs(2*kk+2)
      co3 = trigs(3*kk+1)
      si3 = s*trigs(3*kk+2)
      co4 = trigs(4*kk+1)
      si4 = s*trigs(4*kk+2)
      endif
*
*  loop along transform
*  --------------------
      do 140 jjj = k , (n-1)*inc , 5*jstep
      ja = istart + jjj
*
*     "transverse" loop
*     -----------------
      do 135 nu = 1 , inq
      jb = ja + jstepl
      if (jb.lt.istart) jb = jb + ninc
      jc = jb + jstepl
      if (jc.lt.istart) jc = jc + ninc
      jd = jc + jstepl
      if (jd.lt.istart) jd = jd + ninc
      je = jd + jstepl
      if (je.lt.istart) je = je + ninc
      j = 0
*
*  loop across transforms
*  ----------------------
      if (k.eq.0) then
*
#ifdef OLD_CRAY
cdir$ ivdep, shortloop
#endif
      do 110 l = 1 , nvex
      ajb = a(jb+j)
      aje = a(je+j)
      t1 = ajb + aje
      ajc = a(jc+j)
      ajd = a(jd+j)
      t2 = ajc + ajd
      t3 = ajb - aje
      t4 = ajc - ajd
      t5 = t1 + t2
      t6 = c1 * ( t1 - t2 )
      aja = a(ja+j)
      t7 = aja - 0.25 * t5
      a(ja+j) = aja + t5
      t8 = t7 + t6
      t9 = t7 - t6
      t10 = c3 * t3 - c2 * t4
      t11 = c2 * t3 + c3 * t4
      bjb = b(jb+j)
      bje = b(je+j)
      u1 = bjb + bje
      bjc = b(jc+j)
      bjd = b(jd+j)
      u2 = bjc + bjd
      u3 = bjb - bje
      u4 = bjc - bjd
      u5 = u1 + u2
      u6 = c1 * ( u1 - u2 )
      bja = b(ja+j)
      u7 = bja - 0.25 * u5
      b(ja+j) = bja + u5
      u8 = u7 + u6
      u9 = u7 - u6
      u10 = c3 * u3 - c2 * u4
      u11 = c2 * u3 + c3 * u4
      a(jb+j) = t8 - u11
      b(jb+j) = u8 + t11
      a(je+j) = t8 + u11
      b(je+j) = u8 - t11
      a(jc+j) = t9 - u10
      b(jc+j) = u9 + t10
      a(jd+j) = t9 + u10
      b(jd+j) = u9 - t10
      j = j + jump
  110 continue
*
      else
*
#ifdef OLD_CRAY
cdir$ ivdep, shortloop
#endif
      do 130 l = 1 , nvex
      ajb = a(jb+j)
      aje = a(je+j)
      t1 = ajb + aje
      ajc = a(jc+j)
      ajd = a(jd+j)
      t2 = ajc + ajd
      t3 = ajb - aje
      t4 = ajc - ajd
      t5 = t1 + t2
      t6 = c1 * ( t1 - t2 )
      aja = a(ja+j)
      t7 = aja - 0.25 * t5
      a(ja+j) = aja + t5
      t8 = t7 + t6
      t9 = t7 - t6
      t10 = c3 * t3 - c2 * t4
      t11 = c2 * t3 + c3 * t4
      bjb = b(jb+j)
      bje = b(je+j)
      u1 = bjb + bje
      bjc = b(jc+j)
      bjd = b(jd+j)
      u2 = bjc + bjd
      u3 = bjb - bje
      u4 = bjc - bjd
      u5 = u1 + u2
      u6 = c1 * ( u1 - u2 )
      bja = b(ja+j)
      u7 = bja - 0.25 * u5
      b(ja+j) = bja + u5
      u8 = u7 + u6
      u9 = u7 - u6
      u10 = c3 * u3 - c2 * u4
      u11 = c2 * u3 + c3 * u4
      a(jb+j) = co1*(t8-u11) - si1*(u8+t11)
      b(jb+j) = si1*(t8-u11) + co1*(u8+t11)
      a(je+j) = co4*(t8+u11) - si4*(u8-t11)
      b(je+j) = si4*(t8+u11) + co4*(u8-t11)
      a(jc+j) = co2*(t9-u10) - si2*(u9+t10)
      b(jc+j) = si2*(t9-u10) + co2*(u9+t10)
      a(jd+j) = co3*(t9+u10) - si3*(u9-t10)
      b(jd+j) = si3*(t9+u10) + co3*(u9-t10)
      j = j + jump
  130 continue
*
      endif
*
*-----( end of loop across transforms )
*
      ja = ja + jstepx
      if (ja.lt.istart) ja = ja + ninc
  135 continue
  140 continue
*-----( end of loop along transforms )
      kk = kk + 2*la
  150 continue
*-----( end of loop on nonzero k )
      la = 5*la
  160 continue
*-----( end of loop on type I radix-5 passes)
*
      if (n.eq.5) go to 490
*
*  loop on type II radix-5 passes
*  ------------------------------
  400 continue
*
      do 480 ipass = mh+1 , m
      jstep = (n*inc) / (5*la)
      jstepl = jstep - ninc
      laincl = la * ink - ninc
      kk = 0
*
*     loop on k
*     ---------
      do 470 k = 0 , jstep-ink , ink
*
      if (k.gt.0) then
      co1 = trigs(kk+1)
      si1 = s*trigs(kk+2)
      co2 = trigs(2*kk+1)
      si2 = s*trigs(2*kk+2)
      co3 = trigs(3*kk+1)
      si3 = s*trigs(3*kk+2)
      co4 = trigs(4*kk+1)
      si4 = s*trigs(4*kk+2)
      endif
*
*  double loop along first transform in block
*  ------------------------------------------
      do 460 ll = k , (la-1)*ink , 5*jstep
*
      do 450 jjj = ll , (n-1)*inc , 5*la*ink
      ja = istart + jjj
*
*     "transverse" loop
*     -----------------
      do 445 nu = 1 , inq
      jb = ja + jstepl
      if (jb.lt.istart) jb = jb + ninc
      jc = jb + jstepl
      if (jc.lt.istart) jc = jc + ninc
      jd = jc + jstepl
      if (jd.lt.istart) jd = jd + ninc
      je = jd + jstepl
      if (je.lt.istart) je = je + ninc
      jf = ja + laincl
      if (jf.lt.istart) jf = jf + ninc
      jg = jf + jstepl
      if (jg.lt.istart) jg = jg + ninc
      jh = jg + jstepl
      if (jh.lt.istart) jh = jh + ninc
      ji = jh + jstepl
      if (ji.lt.istart) ji = ji + ninc
      jj = ji + jstepl
      if (jj.lt.istart) jj = jj + ninc
      jk = jf + laincl
      if (jk.lt.istart) jk = jk + ninc
      jl = jk + jstepl
      if (jl.lt.istart) jl = jl + ninc
      jm = jl + jstepl
      if (jm.lt.istart) jm = jm + ninc
      jn = jm + jstepl
      if (jn.lt.istart) jn = jn + ninc
      jo = jn + jstepl
      if (jo.lt.istart) jo = jo + ninc
      jp = jk + laincl
      if (jp.lt.istart) jp = jp + ninc
      jq = jp + jstepl
      if (jq.lt.istart) jq = jq + ninc
      jr = jq + jstepl
      if (jr.lt.istart) jr = jr + ninc
      js = jr + jstepl
      if (js.lt.istart) js = js + ninc
      jt = js + jstepl
      if (jt.lt.istart) jt = jt + ninc
      ju = jp + laincl
      if (ju.lt.istart) ju = ju + ninc
      jv = ju + jstepl
      if (jv.lt.istart) jv = jv + ninc
      jw = jv + jstepl
      if (jw.lt.istart) jw = jw + ninc
      jx = jw + jstepl
      if (jx.lt.istart) jx = jx + ninc
      jy = jx + jstepl
      if (jy.lt.istart) jy = jy + ninc
      j = 0
*
*  loop across transforms
*  ----------------------
      if (k.eq.0) then
*
#ifdef OLD_CRAY
cdir$ ivdep, shortloop
#endif
      do 410 l = 1 , nvex
      ajb = a(jb+j)
      aje = a(je+j)
      t1 = ajb + aje
      ajc = a(jc+j)
      ajd = a(jd+j)
      t2 = ajc + ajd
      t3 = ajb - aje
      t4 = ajc - ajd
      ajf = a(jf+j)
      ajb =  ajf
      t5 = t1 + t2
      t6 = c1 * ( t1 - t2 )
      aja = a(ja+j)
      t7 = aja - 0.25 * t5
      a(ja+j) = aja + t5
      t8 = t7 + t6
      t9 = t7 - t6
      ajk = a(jk+j)
      ajc =  ajk
      t10 = c3 * t3 - c2 * t4
      t11 = c2 * t3 + c3 * t4
      bjb = b(jb+j)
      bje = b(je+j)
      u1 = bjb + bje
      bjc = b(jc+j)
      bjd = b(jd+j)
      u2 = bjc + bjd
      u3 = bjb - bje
      u4 = bjc - bjd
      bjf = b(jf+j)
      bjb =  bjf
      u5 = u1 + u2
      u6 = c1 * ( u1 - u2 )
      bja = b(ja+j)
      u7 = bja - 0.25 * u5
      b(ja+j) = bja + u5
      u8 = u7 + u6
      u9 = u7 - u6
      bjk = b(jk+j)
      bjc =  bjk
      u10 = c3 * u3 - c2 * u4
      u11 = c2 * u3 + c3 * u4
      a(jf+j) = t8 - u11
      b(jf+j) = u8 + t11
      aje =  t8 + u11
      bje =  u8 - t11
      a(jk+j) = t9 - u10
      b(jk+j) = u9 + t10
      ajd =  t9 + u10
      bjd =  u9 - t10
*----------------------
      ajg = a(jg+j)
      ajj = a(jj+j)
      t1 = ajg + ajj
      ajh = a(jh+j)
      aji = a(ji+j)
      t2 = ajh + aji
      t3 = ajg - ajj
      t4 = ajh - aji
      ajl = a(jl+j)
      ajh =  ajl
      t5 = t1 + t2
      t6 = c1 * ( t1 - t2 )
      t7 = ajb - 0.25 * t5
      a(jb+j) = ajb + t5
      t8 = t7 + t6
      t9 = t7 - t6
      ajq = a(jq+j)
      aji =  ajq
      t10 = c3 * t3 - c2 * t4
      t11 = c2 * t3 + c3 * t4
      bjg = b(jg+j)
      bjj = b(jj+j)
      u1 = bjg + bjj
      bjh = b(jh+j)
      bji = b(ji+j)
      u2 = bjh + bji
      u3 = bjg - bjj
      u4 = bjh - bji
      bjl = b(jl+j)
      bjh =  bjl
      u5 = u1 + u2
      u6 = c1 * ( u1 - u2 )
      u7 = bjb - 0.25 * u5
      b(jb+j) = bjb + u5
      u8 = u7 + u6
      u9 = u7 - u6
      bjq = b(jq+j)
      bji =  bjq
      u10 = c3 * u3 - c2 * u4
      u11 = c2 * u3 + c3 * u4
      a(jg+j) = t8 - u11
      b(jg+j) = u8 + t11
      ajj =  t8 + u11
      bjj =  u8 - t11
      a(jl+j) = t9 - u10
      b(jl+j) = u9 + t10
      a(jq+j) = t9 + u10
      b(jq+j) = u9 - t10
*----------------------
      ajo = a(jo+j)
      t1 = ajh + ajo
      ajm = a(jm+j)
      ajn = a(jn+j)
      t2 = ajm + ajn
      t3 = ajh - ajo
      t4 = ajm - ajn
      ajr = a(jr+j)
      ajn =  ajr
      t5 = t1 + t2
      t6 = c1 * ( t1 - t2 )
      t7 = ajc - 0.25 * t5
      a(jc+j) = ajc + t5
      t8 = t7 + t6
      t9 = t7 - t6
      ajw = a(jw+j)
      ajo =  ajw
      t10 = c3 * t3 - c2 * t4
      t11 = c2 * t3 + c3 * t4
      bjo = b(jo+j)
      u1 = bjh + bjo
      bjm = b(jm+j)
      bjn = b(jn+j)
      u2 = bjm + bjn
      u3 = bjh - bjo
      u4 = bjm - bjn
      bjr = b(jr+j)
      bjn =  bjr
      u5 = u1 + u2
      u6 = c1 * ( u1 - u2 )
      u7 = bjc - 0.25 * u5
      b(jc+j) = bjc + u5
      u8 = u7 + u6
      u9 = u7 - u6
      bjw = b(jw+j)
      bjo =  bjw
      u10 = c3 * u3 - c2 * u4
      u11 = c2 * u3 + c3 * u4
      a(jh+j) = t8 - u11
      b(jh+j) = u8 + t11
      a(jw+j) = t8 + u11
      b(jw+j) = u8 - t11
      a(jm+j) = t9 - u10
      b(jm+j) = u9 + t10
      a(jr+j) = t9 + u10
      b(jr+j) = u9 - t10
*----------------------
      ajt = a(jt+j)
      t1 = aji + ajt
      ajs = a(js+j)
      t2 = ajn + ajs
      t3 = aji - ajt
      t4 = ajn - ajs
      ajx = a(jx+j)
      ajt =  ajx
      t5 = t1 + t2
      t6 = c1 * ( t1 - t2 )
      ajp = a(jp+j)
      t7 = ajp - 0.25 * t5
      ax = ajp + t5
      t8 = t7 + t6
      t9 = t7 - t6
      a(jp+j) = ajd
      t10 = c3 * t3 - c2 * t4
      t11 = c2 * t3 + c3 * t4
      a(jd+j) = ax
      bjt = b(jt+j)
      u1 = bji + bjt
      bjs = b(js+j)
      u2 = bjn + bjs
      u3 = bji - bjt
      u4 = bjn - bjs
      bjx = b(jx+j)
      bjt =  bjx
      u5 = u1 + u2
      u6 = c1 * ( u1 - u2 )
      bjp = b(jp+j)
      u7 = bjp - 0.25 * u5
      bx = bjp + u5
      u8 = u7 + u6
      u9 = u7 - u6
      b(jp+j) = bjd
      u10 = c3 * u3 - c2 * u4
      u11 = c2 * u3 + c3 * u4
      b(jd+j) = bx
      a(ji+j) = t8 - u11
      b(ji+j) = u8 + t11
      a(jx+j) = t8 + u11
      b(jx+j) = u8 - t11
      a(jn+j) = t9 - u10
      b(jn+j) = u9 + t10
      a(js+j) = t9 + u10
      b(js+j) = u9 - t10
*----------------------
      ajv = a(jv+j)
      ajy = a(jy+j)
      t1 = ajv + ajy
      t2 = ajo + ajt
      t3 = ajv - ajy
      t4 = ajo - ajt
      a(jv+j) = ajj
      t5 = t1 + t2
      t6 = c1 * ( t1 - t2 )
      aju = a(ju+j)
      t7 = aju - 0.25 * t5
      ax = aju + t5
      t8 = t7 + t6
      t9 = t7 - t6
      a(ju+j) = aje
      t10 = c3 * t3 - c2 * t4
      t11 = c2 * t3 + c3 * t4
      a(je+j) = ax
      bjv = b(jv+j)
      bjy = b(jy+j)
      u1 = bjv + bjy
      u2 = bjo + bjt
      u3 = bjv - bjy
      u4 = bjo - bjt
      b(jv+j) = bjj
      u5 = u1 + u2
      u6 = c1 * ( u1 - u2 )
      bju = b(ju+j)
      u7 = bju - 0.25 * u5
      bx = bju + u5
      u8 = u7 + u6
      u9 = u7 - u6
      b(ju+j) = bje
      u10 = c3 * u3 - c2 * u4
      u11 = c2 * u3 + c3 * u4
      b(je+j) = bx
      a(jj+j) = t8 - u11
      b(jj+j) = u8 + t11
      a(jy+j) = t8 + u11
      b(jy+j) = u8 - t11
      a(jo+j) = t9 - u10
      b(jo+j) = u9 + t10
      a(jt+j) = t9 + u10
      b(jt+j) = u9 - t10
      j = j + jump
  410 continue
*
      else
*
#ifdef OLD_CRAY
cdir$ ivdep, shortloop
#endif
      do 440 l = 1 , nvex
      ajb = a(jb+j)
      aje = a(je+j)
      t1 = ajb + aje
      ajc = a(jc+j)
      ajd = a(jd+j)
      t2 = ajc + ajd
      t3 = ajb - aje
      t4 = ajc - ajd
      ajf = a(jf+j)
      ajb =  ajf
      t5 = t1 + t2
      t6 = c1 * ( t1 - t2 )
      aja = a(ja+j)
      t7 = aja - 0.25 * t5
      a(ja+j) = aja + t5
      t8 = t7 + t6
      t9 = t7 - t6
      ajk = a(jk+j)
      ajc =  ajk
      t10 = c3 * t3 - c2 * t4
      t11 = c2 * t3 + c3 * t4
      bjb = b(jb+j)
      bje = b(je+j)
      u1 = bjb + bje
      bjc = b(jc+j)
      bjd = b(jd+j)
      u2 = bjc + bjd
      u3 = bjb - bje
      u4 = bjc - bjd
      bjf = b(jf+j)
      bjb =  bjf
      u5 = u1 + u2
      u6 = c1 * ( u1 - u2 )
      bja = b(ja+j)
      u7 = bja - 0.25 * u5
      b(ja+j) = bja + u5
      u8 = u7 + u6
      u9 = u7 - u6
      bjk = b(jk+j)
      bjc =  bjk
      u10 = c3 * u3 - c2 * u4
      u11 = c2 * u3 + c3 * u4
      a(jf+j) = co1*(t8-u11) - si1*(u8+t11)
      b(jf+j) = si1*(t8-u11) + co1*(u8+t11)
      aje =  co4*(t8+u11) - si4*(u8-t11)
      bje =  si4*(t8+u11) + co4*(u8-t11)
      a(jk+j) = co2*(t9-u10) - si2*(u9+t10)
      b(jk+j) = si2*(t9-u10) + co2*(u9+t10)
      ajd =  co3*(t9+u10) - si3*(u9-t10)
      bjd =  si3*(t9+u10) + co3*(u9-t10)
*----------------------
      ajg = a(jg+j)
      ajj = a(jj+j)
      t1 = ajg + ajj
      ajh = a(jh+j)
      aji = a(ji+j)
      t2 = ajh + aji
      t3 = ajg - ajj
      t4 = ajh - aji
      ajl = a(jl+j)
      ajh =  ajl
      t5 = t1 + t2
      t6 = c1 * ( t1 - t2 )
      t7 = ajb - 0.25 * t5
      a(jb+j) = ajb + t5
      t8 = t7 + t6
      t9 = t7 - t6
      ajq = a(jq+j)
      aji =  ajq
      t10 = c3 * t3 - c2 * t4
      t11 = c2 * t3 + c3 * t4
      bjg = b(jg+j)
      bjj = b(jj+j)
      u1 = bjg + bjj
      bjh = b(jh+j)
      bji = b(ji+j)
      u2 = bjh + bji
      u3 = bjg - bjj
      u4 = bjh - bji
      bjl = b(jl+j)
      bjh =  bjl
      u5 = u1 + u2
      u6 = c1 * ( u1 - u2 )
      u7 = bjb - 0.25 * u5
      b(jb+j) = bjb + u5
      u8 = u7 + u6
      u9 = u7 - u6
      bjq = b(jq+j)
      bji =  bjq
      u10 = c3 * u3 - c2 * u4
      u11 = c2 * u3 + c3 * u4
      a(jg+j) = co1*(t8-u11) - si1*(u8+t11)
      b(jg+j) = si1*(t8-u11) + co1*(u8+t11)
      ajj =  co4*(t8+u11) - si4*(u8-t11)
      bjj =  si4*(t8+u11) + co4*(u8-t11)
      a(jl+j) = co2*(t9-u10) - si2*(u9+t10)
      b(jl+j) = si2*(t9-u10) + co2*(u9+t10)
      a(jq+j) = co3*(t9+u10) - si3*(u9-t10)
      b(jq+j) = si3*(t9+u10) + co3*(u9-t10)
*----------------------
      ajo = a(jo+j)
      t1 = ajh + ajo
      ajm = a(jm+j)
      ajn = a(jn+j)
      t2 = ajm + ajn
      t3 = ajh - ajo
      t4 = ajm - ajn
      ajr = a(jr+j)
      ajn =  ajr
      t5 = t1 + t2
      t6 = c1 * ( t1 - t2 )
      t7 = ajc - 0.25 * t5
      a(jc+j) = ajc + t5
      t8 = t7 + t6
      t9 = t7 - t6
      ajw = a(jw+j)
      ajo =  ajw
      t10 = c3 * t3 - c2 * t4
      t11 = c2 * t3 + c3 * t4
      bjo = b(jo+j)
      u1 = bjh + bjo
      bjm = b(jm+j)
      bjn = b(jn+j)
      u2 = bjm + bjn
      u3 = bjh - bjo
      u4 = bjm - bjn
      bjr = b(jr+j)
      bjn =  bjr
      u5 = u1 + u2
      u6 = c1 * ( u1 - u2 )
      u7 = bjc - 0.25 * u5
      b(jc+j) = bjc + u5
      u8 = u7 + u6
      u9 = u7 - u6
      bjw = b(jw+j)
      bjo =  bjw
      u10 = c3 * u3 - c2 * u4
      u11 = c2 * u3 + c3 * u4
      a(jh+j) = co1*(t8-u11) - si1*(u8+t11)
      b(jh+j) = si1*(t8-u11) + co1*(u8+t11)
      a(jw+j) = co4*(t8+u11) - si4*(u8-t11)
      b(jw+j) = si4*(t8+u11) + co4*(u8-t11)
      a(jm+j) = co2*(t9-u10) - si2*(u9+t10)
      b(jm+j) = si2*(t9-u10) + co2*(u9+t10)
      a(jr+j) = co3*(t9+u10) - si3*(u9-t10)
      b(jr+j) = si3*(t9+u10) + co3*(u9-t10)
*----------------------
      ajt = a(jt+j)
      t1 = aji + ajt
      ajs = a(js+j)
      t2 = ajn + ajs
      t3 = aji - ajt
      t4 = ajn - ajs
      ajx = a(jx+j)
      ajt =  ajx
      t5 = t1 + t2
      t6 = c1 * ( t1 - t2 )
      ajp = a(jp+j)
      t7 = ajp - 0.25 * t5
      ax = ajp + t5
      t8 = t7 + t6
      t9 = t7 - t6
      a(jp+j) = ajd
      t10 = c3 * t3 - c2 * t4
      t11 = c2 * t3 + c3 * t4
      a(jd+j) = ax
      bjt = b(jt+j)
      u1 = bji + bjt
      bjs = b(js+j)
      u2 = bjn + bjs
      u3 = bji - bjt
      u4 = bjn - bjs
      bjx = b(jx+j)
      bjt =  bjx
      u5 = u1 + u2
      u6 = c1 * ( u1 - u2 )
      bjp = b(jp+j)
      u7 = bjp - 0.25 * u5
      bx = bjp + u5
      u8 = u7 + u6
      u9 = u7 - u6
      b(jp+j) = bjd
      u10 = c3 * u3 - c2 * u4
      u11 = c2 * u3 + c3 * u4
      b(jd+j) = bx
      a(ji+j) = co1*(t8-u11) - si1*(u8+t11)
      b(ji+j) = si1*(t8-u11) + co1*(u8+t11)
      a(jx+j) = co4*(t8+u11) - si4*(u8-t11)
      b(jx+j) = si4*(t8+u11) + co4*(u8-t11)
      a(jn+j) = co2*(t9-u10) - si2*(u9+t10)
      b(jn+j) = si2*(t9-u10) + co2*(u9+t10)
      a(js+j) = co3*(t9+u10) - si3*(u9-t10)
      b(js+j) = si3*(t9+u10) + co3*(u9-t10)
*----------------------
      ajv = a(jv+j)
      ajy = a(jy+j)
      t1 = ajv + ajy
      t2 = ajo + ajt
      t3 = ajv - ajy
      t4 = ajo - ajt
      a(jv+j) = ajj
      t5 = t1 + t2
      t6 = c1 * ( t1 - t2 )
      aju = a(ju+j)
      t7 = aju - 0.25 * t5
      ax = aju + t5
      t8 = t7 + t6
      t9 = t7 - t6
      a(ju+j) = aje
      t10 = c3 * t3 - c2 * t4
      t11 = c2 * t3 + c3 * t4
      a(je+j) = ax
      bjv = b(jv+j)
      bjy = b(jy+j)
      u1 = bjv + bjy
      u2 = bjo + bjt
      u3 = bjv - bjy
      u4 = bjo - bjt
      b(jv+j) = bjj
      u5 = u1 + u2
      u6 = c1 * ( u1 - u2 )
      bju = b(ju+j)
      u7 = bju - 0.25 * u5
      bx = bju + u5
      u8 = u7 + u6
      u9 = u7 - u6
      b(ju+j) = bje
      u10 = c3 * u3 - c2 * u4
      u11 = c2 * u3 + c3 * u4
      b(je+j) = bx
      a(jj+j) = co1*(t8-u11) - si1*(u8+t11)
      b(jj+j) = si1*(t8-u11) + co1*(u8+t11)
      a(jy+j) = co4*(t8+u11) - si4*(u8-t11)
      b(jy+j) = si4*(t8+u11) + co4*(u8-t11)
      a(jo+j) = co2*(t9-u10) - si2*(u9+t10)
      b(jo+j) = si2*(t9-u10) + co2*(u9+t10)
      a(jt+j) = co3*(t9+u10) - si3*(u9-t10)
      b(jt+j) = si3*(t9+u10) + co3*(u9-t10)
      j = j + jump
  440 continue
*
      endif
*
*-----(end of loop across transforms)
*
      ja = ja + jstepx
      if (ja.lt.istart) ja = ja + ninc
  445 continue
  450 continue
  460 continue
*-----( end of double loop for this k )
      kk = kk + 2*la
  470 continue
*-----( end of loop over values of k )
      la = 5*la
  480 continue
*-----( end of loop on type II radix-5 passes )
*-----( nvex transforms completed)
  490 continue
      istart = istart + nvex * jump
  500 continue
*-----( end of loop on blocks of transforms )
*
      return
      end subroutine gpfa5f

!*********************************************************************

      SUBROUTINE SETGPFA(TRIGS,maxtrigs,ntrigs,N)
*
      use precision, only : dp

      INTEGER maxtrigs,NJ(3),ntrigs
      REAL(dp) TRIGS(maxtrigs)

      integer       :: nn, n, ifac, ll, kk, ip, iq, ir,
     $                 i, ni, irot, kink, k
      real(dp)      :: angle, twopi, del
*
*     DECOMPOSE N INTO FACTORS 2,3,5
*     ------------------------------
      NN = N
      IFAC = 2
*
      DO 30 LL = 1 , 3
      KK = 0
   10 CONTINUE
      IF (MOD(NN,IFAC).NE.0) GO TO 20
      KK = KK + 1
      NN = NN / IFAC
      GO TO 10
   20 CONTINUE
      NJ(LL) = KK
      IFAC = IFAC + LL
   30 CONTINUE
*
      IF (NN.NE.1) THEN
         WRITE(6,40) N
   40    FORMAT(' *** WARNING!!!',I10,' IS NOT A LEGAL VALUE OF N ***')
         RETURN
      ENDIF
*
      IP = NJ(1)
      IQ = NJ(2)
      IR = NJ(3)
*
*     COMPUTE LIST OF ROTATED TWIDDLE FACTORS
*     ---------------------------------------
      NJ(1) = 2**IP
      NJ(2) = 3**IQ
      NJ(3) = 5**IR
*
      TWOPI = 4.0_dp * ASIN(1.0_dp)
      I = 1
*
      DO 60 LL = 1 , 3
      NI = NJ(LL)
      IF (NI.EQ.1) GO TO 60
*
      DEL = TWOPI / real(NI,kind=dp)
      IROT = N / NI
      KINK = MOD(IROT,NI)
      KK = 0
*
      DO 50 K = 1 , NI
      ANGLE = real(KK,kind=dp) * DEL
      if (i.lt.maxtrigs) then
        TRIGS(I) = COS(ANGLE)
        TRIGS(I+1) = SIN(ANGLE)
      endif
      I = I + 2
      KK = KK + KINK
      IF (KK.GT.NI) KK = KK - NI
   50 CONTINUE
   60 CONTINUE
      ntrigs = i
*
      RETURN
      END SUBROUTINE SETGPFA

      END MODULE fft1d

